In [1]:
    
import pandas as pd
import numpy as np
# import the scikit-learn submodules used below explicitly rather than via a wildcard
from sklearn import (compose, ensemble, impute, linear_model,
                     model_selection, pipeline, preprocessing, svm, tree)
import matplotlib.pyplot as plt
%matplotlib inline
    
In [2]:
    
df = pd.read_csv("/data/credit-default.csv")
df.head()
    
    Out[2]:
In [3]:
    
df.info()
    
    
In [4]:
    
df.default.value_counts()
    
    Out[4]:
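Raw counts alone can hide how skewed the target is; an optional follow-up is to look at the same counts as proportions:

In [ ]:

# relative class frequencies of the target
df.default.value_counts(normalize=True)
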
In [5]:
    
target = "default"
# encode the target labels as integers (0/1)
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(df[target])
# all remaining columns are used as features
X = df.drop(columns=[target])
X.head()
    
    Out[5]:
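To see which original label was mapped to which integer, the fitted encoder can be inspected:

In [ ]:

# classes_ lists the original labels in sorted order; their positions are the encoded values
label_encoder.classes_
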
In [6]:
    
# categorical feature names (object dtype)
cat_columns = [field for field in X.columns
               if X.dtypes[field] == "object"]
cat_columns
    
    Out[6]:
In [7]:
    
# numeric feature names (everything that is not object dtype)
num_columns = [field for field in X.columns
               if X.dtypes[field] != "object"]
num_columns
    
    Out[7]:
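An equivalent, slightly more compact way to make the same split is pandas' select_dtypes; shown here only as an alternative sketch:

In [ ]:

# same column split using select_dtypes (should match cat_columns / num_columns above)
cat_cols_alt = X.select_dtypes(include="object").columns.tolist()
num_cols_alt = X.select_dtypes(exclude="object").columns.tolist()
cat_cols_alt, num_cols_alt
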
In [8]:
    
# categorical branch: fill missing values with a constant, then one-hot encode
cat_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='constant'
                                     , fill_value='missing')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='error'
                                           , drop="first"))
]) 
# numeric branch: median-impute, optional polynomial expansion, then standardize
num_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='median')),
    # degree=1 leaves the features unchanged; kept as a hook for later tuning
    ('poly', preprocessing.PolynomialFeatures(degree=1
                                    , include_bias=False)),
    ('scaler', preprocessing.StandardScaler()),
])
# route each branch to its own column subset
preprocessing_pipe = compose.ColumnTransformer([
    ("cat", cat_pipe, cat_columns),
    ("num", num_pipe, num_columns)
])
    
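Before wiring the transformer into a model, it can be fitted on its own to sanity-check the shape of the encoded feature matrix (the exact column count depends on the categorical cardinalities in the data):

In [ ]:

# rows should match X; columns = one-hot dummies (minus dropped levels) + numeric features
preprocessing_pipe.fit_transform(X).shape
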
In [9]:
    
# baseline model: logistic regression on the preprocessed features
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", linear_model.LogisticRegression(random_state=1
                                , solver="liblinear"))
])
# search over ten values of C drawn uniformly from [1, 2)
param_grid = {
    "est__C": np.random.random(10) + 1
}
gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv=5
                        , verbose=1, n_jobs=8, scoring="accuracy")
gsearch.fit(X, y)
print("Best score: ", gsearch.best_score_
        , "Best parameters: ", gsearch.best_params_)
    
    
    
    
    
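By default GridSearchCV refits the best parameter combination on all of the data, so the tuned pipeline is immediately usable for prediction, for example:

In [ ]:

# best_estimator_ is the full pipeline (preprocessing + model) refit on X, y
gsearch.best_estimator_.predict(X.head())
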
In [10]:
    
# three individual classifiers combined by majority (hard) voting
log_clf = linear_model.LogisticRegression(C=1.53
                            , solver="liblinear", random_state=1) 
rnd_clf = ensemble.RandomForestClassifier(max_depth=6
                            , n_estimators=30, random_state=1) 
svm_clf = svm.SVC(C=1.0, gamma=0.15, random_state=1) 
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.VotingClassifier(voting="hard", estimators=
                                      [('lr', log_clf), 
                                       ('rf', rnd_clf), 
                                       ('svm', svm_clf)
                                      ])
    )
])
# tune only the SVM member's C; the name chains pipeline step -> estimator name -> parameter
param_grid = {
    "est__svm__C": np.linspace(1.0, 20, 10)
}
gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv=5
                    , verbose=1, n_jobs=8, scoring="accuracy")
gsearch.fit(X, y)
print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)
    
    
    
    
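With voting="hard" the ensemble takes a majority vote over predicted labels; switching to soft voting would average predicted probabilities instead, which requires the SVC to expose them. A minimal sketch of that variant:

In [ ]:

# soft voting needs probability estimates from every member,
# so the SVC must be built with probability=True (slower to fit)
svm_soft = svm.SVC(C=1.0, gamma=0.15, probability=True, random_state=1)
soft_voter = ensemble.VotingClassifier(voting="soft", estimators=[
    ('lr', log_clf), ('rf', rnd_clf), ('svm', svm_soft)])
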
In [11]:
    
# AdaBoost with logistic regression as the base estimator
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.AdaBoostClassifier(
          linear_model.LogisticRegression(random_state=1
                                          , solver="liblinear")
        , n_estimators=200
        , algorithm="SAMME.R"
        , learning_rate=0.051)
    )
])
# tune the base estimator's C through the nested parameter name
param_grid = {
    "est__base_estimator__C": np.random.random(10) + 1
}
gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv=5
                                , verbose=1, n_jobs=8, scoring="accuracy")
gsearch.fit(X, y)
print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)
    
    
    
    
    
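Note that newer scikit-learn releases (1.2 and later) renamed AdaBoostClassifier's base_estimator argument to estimator, so on those versions the nested grid key changes to match; a sketch of the adjusted grid:

In [ ]:

# equivalent grid key under scikit-learn >= 1.2, where base_estimator became estimator
param_grid_v12 = {
    "est__estimator__C": np.random.random(10) + 1
}
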
In [12]:
    
# bagged decision trees; each tree sees a 50% bootstrap sample,
# and oob_score=True keeps an out-of-bag accuracy estimate
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.BaggingClassifier(
                tree.DecisionTreeClassifier(), 
                max_samples=0.5,
                n_estimators=50,
                bootstrap=True, 
                oob_score=True)
    )
])
# tune the depth of the underlying trees
param_grid = {
    "est__base_estimator__max_depth": np.arange(5, 15)
}
gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv=5
                        , verbose=1, n_jobs=8, scoring="accuracy")
gsearch.fit(X, y)
print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)
    
    
    
    
    
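Because oob_score=True was set, the refit bagging model also carries an out-of-bag accuracy estimate, which can be read off the best pipeline:

In [ ]:

# OOB accuracy of the bagging ensemble inside the best refit pipeline
gsearch.best_estimator_.named_steps["est"].oob_score_
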
In [18]:
    
# gradient boosting: jointly tune tree depth and learning rate (7 x 10 = 70 candidates)
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.GradientBoostingClassifier(random_state=1))
])
param_grid = {
    "est__max_depth": np.arange(3, 10),
    "est__learning_rate": np.linspace(0.01, 1, 10)
}
gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv=5
                        , verbose=1, n_jobs=8, scoring="accuracy")
gsearch.fit(X, y)
print("Best score: ", gsearch.best_score_
        , "Best parameters: ", gsearch.best_params_)
    
    
    
    
    
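The boosted trees also expose per-feature importances; pairing them with the transformer's output names gives a rough view of which encoded columns drive the model. A sketch, assuming a scikit-learn version where the whole preprocessing pipeline supports get_feature_names_out:

In [ ]:

# importances of the encoded features used by the boosted trees
best_pipe = gsearch.best_estimator_
importances = pd.Series(
    best_pipe.named_steps["est"].feature_importances_,
    index=best_pipe.named_steps["preprocessing"].get_feature_names_out())
importances.sort_values(ascending=False).head(10)
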
In [23]:
    
# cross-validation results of the last grid search, one row per parameter setting
scores = pd.DataFrame(gsearch.cv_results_)
scores.head()
    
    Out[23]:
In [24]:
    
# the best-ranked parameter setting(s)
scores[scores.rank_test_score == 1]
    
    Out[24]:
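Sorting the same table by mean test score makes it easy to compare the winner against the runners-up:

In [ ]:

# top parameter combinations by mean cross-validated accuracy
cols = ["params", "mean_test_score", "std_test_score", "rank_test_score"]
scores.sort_values("mean_test_score", ascending=False)[cols].head()
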
In [ ]: